{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ai/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n",
      "Using TensorFlow backend.\n",
      "/home/ai/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:516: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n",
      "/home/ai/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:517: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n",
      "/home/ai/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:518: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n",
      "/home/ai/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:519: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n",
      "/home/ai/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:520: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n",
      "/home/ai/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n",
      "/home/ai/anaconda3/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:541: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_qint8 = np.dtype([(\"qint8\", np.int8, 1)])\n",
      "/home/ai/anaconda3/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:542: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_quint8 = np.dtype([(\"quint8\", np.uint8, 1)])\n",
      "/home/ai/anaconda3/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:543: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_qint16 = np.dtype([(\"qint16\", np.int16, 1)])\n",
      "/home/ai/anaconda3/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:544: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_quint16 = np.dtype([(\"quint16\", np.uint16, 1)])\n",
      "/home/ai/anaconda3/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:545: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  _np_qint32 = np.dtype([(\"qint32\", np.int32, 1)])\n",
      "/home/ai/anaconda3/lib/python3.6/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py:550: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.\n",
      "  np_resource = np.dtype([(\"resource\", np.ubyte, 1)])\n"
     ]
    }
   ],
   "source": [
    "import os, sys\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from tqdm import tqdm\n",
    "import PIL\n",
    "from PIL import Image, ImageOps\n",
    "import cv2\n",
    "from sklearn.utils import class_weight, shuffle\n",
    "from keras.losses import binary_crossentropy\n",
    "from keras.applications.resnet50 import preprocess_input\n",
    "import keras.backend as K\n",
    "import tensorflow as tf\n",
    "from sklearn.metrics import f1_score, fbeta_score\n",
    "from keras.utils import Sequence\n",
    "from keras.utils import to_categorical\n",
    "from sklearn.model_selection import train_test_split\n",
    "from tqdm import tqdm_notebook\n",
    "\n",
    "WORKERS = 2\n",
    "CHANNEL = 3\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "IMG_SIZE = 512\n",
    "NUM_CLASSES = 5\n",
    "SEED = 77\n",
    "TRAIN_NUM = 1000 # use 1000 when you just want to explore new idea, use -1 for full train"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction. Explore first, train later.\n",
    "\n",
    "Hi everyone! As Aravind Eye Hospital is one of my favorite organization in the world; they take care of poor people's eyes for free with an impressive sustainable business model. I will try my best to contribute something to our community. One intuitive way to improve the performance of our model is to simply improve the quality of input images. In this kernel, I will share two ideas which I hope may be useful to some of you :\n",
    "\n",
    "- **Reducing lighting-condition effects** : as we will see, images come with many different lighting conditions, some images are very dark and difficult to visualize. We can try to convert the image to gray scale, and visualize better. Alternatively, there is a better approach. We can try the method of Ben Graham (last competition's winner)\n",
    "- **Cropping uninformative area**: everyone know this :) Here, I just find the codes from internet and choose the best one for you :)\n",
    "\n",
    "We are going to apply both techniques to both the official data, and the past competition data (shout out @tanlikesmath for creating this dataset! https://www.kaggle.com/tanlikesmath/diabetic-retinopathy-resized . In the updated version, I also try @donkeys' dataset https://www.kaggle.com/donkeys/retinopathy-train-2015 , which is .png which may be have higer image quality than .jpeg format)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = pd.read_csv('../input/aptos2019-blindness-detection/train.csv')\n",
    "df_test = pd.read_csv('../input/aptos2019-blindness-detection/test.csv')\n",
    "\n",
    "x = df_train['id_code']\n",
    "y = df_train['diagnosis']\n",
    "\n",
    "x, y = shuffle(x, y, random_state=SEED)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(3112,) (3112,) (550,) (550,)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x7f608cc94588>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAD8CAYAAAB+UHOxAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAF2hJREFUeJzt3X+QHOV95/H3xxK/jBIkI7OnrJSsfJF9IVZ+wJ4sxxXXyEpAgAtRdaYijhhBSG1djB3fgQuLuCqqOEWF3IVgw+Wc2hgdIlFYE+yLFCOHKMCESlUkg7CNkGXMGuvQgsLaFshZg3HJ+eaPebAn69md6e6dGUnP51U1pe7nebr7262d/Wz3TM8oIjAzs/y8rt8FmJlZfzgAzMwy5QAwM8uUA8DMLFMOADOzTDkAzMwy5QAwM8tU2wCQtEXSpKQnp7V/QNJTkvZL+p9N7TdJGk99Fza1r0tt45I2ze1umJlZUWp3I5ikdwJTwN0R8dbUtgb4CHBJRLwq6ZyImJR0LnAPsAr4CeDvgTenVX0V+FVgAngUuCIivtyFfTIzsw7MbzcgIh6RNDSt+beAWyLi1TRmMrWvB8ZS+9cljdMIA4DxiHgGQNJYGjtrACxevDiGhqZvunPf+c53OPPMM0sv3y2uqxjXVYzrKuZkrGvv3r3fjIg3thvXNgBm8GbglyXdDHwX+FBEPAoMArubxk2kNoBD09rf1m4jQ0NDPPbYYyVLhHq9Tq1WK718t7iuYlxXMa6rmJOxLkn/v5NxZQNgPrAIWA38Z+BeSW8C1GJs0Pq1hpbXniSNACMAAwMD1Ov1kiXC1NRUpeW7xXUV47qKcV3FZF1XRLR9AEPAk03zfwvUmua/BrwRuAm4qan9AeDt6fFAU/u/GzfT4/zzz48qHn744UrLd4vrKsZ1FeO6ijkZ6wIeiw5+t5d9G+hfA+8CkPRm4FTgm8AOYIOk0yQtB1YAn6fxou8KScslnQpsSGPNzKxP2l4CknQPUAMWS5oANgNbgC3praHfAzam1Nkv6V4aL+4eA66LiO+n9byfxhnBPGBLROzvwv6YmVmHOnkX0BUzdP36DONvBm5u0b4T2FmoOjMz6xrfCWxmlikHgJlZphwAZmaZcgCYmWWq7I1gJ4R9zx3l6k3393y7B2+5pOfbNDMrymcAZmaZcgCYmWXKAWBmlikHgJlZphwAZmaZcgCYmWXKAWBmlikHgJlZphwAZmaZcgCYmWXKAWBmlikHgJlZphwAZmaZahsAkrZImkzf/zu970OSQtLiNC9Jt0sal/SEpPOaxm6U9HR6bJzb3TAzs6I6OQO4C1g3vVHSMuBXgWebmi8CVqTHCPCJNPYNNL5M/m3AKmCzpEVVCjczs2raBkBEPAIcadF1G3AjEE1t64G7o2E3sFDSEuBCYFdEHImIF4FdtAgVMzPrnVKvAUi6FHguIr40rWsQONQ0P5HaZmo3M7M+KfyNYJJeD3wEuKBVd4u2mKW91fpHaFw+YmBggHq9XrTEHxg4A25Yeaz08mW1q3lqaqrSfnWL6yrGdRXjuorpRV1lvhLyPwLLgS9JAlgKPC5pFY2/7Jc1jV0KPJ/aa9Pa661WHhGjwCjA8PBw1Gq1VsM6cse27dy6r/ffennwytqs/fV6nSr71S2uqxjXVYzrKqYXdRW+BBQR+yLinIgYioghGr/cz4uIfwZ2AFeldwOtBo5GxGHgAeACSYvSi78XpDYzM+uTTt4Geg/wT8BbJE1IunaW4TuBZ4Bx4M+A9wFExBHg94FH0+Ojqc3MzPqk7fWRiLiiTf9Q03QA180wbguwpWB9ZmbWJb4T2MwsUw4AM7NMOQDMzDLlADAzy5QDwMwsUw4AM7NMOQDMzDLlADAzy5QDwMwsUw4AM7NMOQDMzDLlADAzy5QDwMwsUw4AM7NMOQDMzDLlADAzy5QDwMwsUw4AM7NMdfKdwFskTUp6sqntf0n6iqQnJP0/SQub+m6SNC7pKUkXNrWvS23jkjbN/a6YmVkRnZwB3AWsm9a2C3hrRPwc8FXgJgBJ5wIbgJ9Ny/wfSfMkzQP+BLgIOBe4Io01M7M+aRsAEfEIcGRa299FxLE0uxtYmqbXA2MR8WpEfB0YB1alx3hEPBMR3wPG0lgzM+uTuXgN4DeAz6XpQeBQU99Eapup3czM+mR+lYUlfQQ4Bmx7ranFsKB10MQM6xwBRgAGBgao1+ul6xs4A25Yeaz9wDnWruapqalK+9UtrqsY11WM6yqmF3WVDgBJG4F3A2sj4rVf5hPAsqZhS4Hn0/RM7f9ORIwCowDDw8NRq9XKlsgd27Zz675KGVfKwStrs/bX63Wq7Fe3uK5iXFcxrquYXtRV6hKQpHXAh4FLI+Llpq4dwAZJp0laDqwAPg88CqyQtFzSqTReKN5RrXQzM6ui7Z/Hku4BasBiSRPAZhrv+jkN2CUJYHdE/LeI2C/pXuDLNC4NXRcR30/reT/wADAP2BIR+7uwP2Zm1qG2ARARV7RovnOW8TcDN7do3wnsLFSdmZl1je8ENjPLlAPAzCxTDgAzs0w5AMzMMuUAMDPLlAPAzCxTDgAzs0w5AMzMMuUAMDPLlAPAzCxTDgAzs0w5AMzMMuUAMDPLlAPAzCxTDgAzs0w5AMzMMuUAMDPLlAPAzCxTbQNA0hZJk5KebGp7g6Rdkp5O/y5K7ZJ0u6RxSU9IOq9pmY1p/NOSNnZnd8zMrFOdnAHcBayb1rYJeDAiVgAPpnmAi4AV6TECfAIagUHjy+TfBqwCNr8WGmZm1h9tAyAiHgGOTGteD2xN01uBy5ra746G3cBCSUuAC4FdEXEkIl4EdvGjoWJmZj1U9jWAgYg4DJD+PSe1DwKHmsZNpLaZ2s3MrE/mz/H61KItZmn/0RVIIzQuHzEwMEC9Xi9dzMAZcMPKY6WXL6tdzVNTU5X2q1tcVzGuqxjXVUwv6iobAC9IWhIRh9MlnsnUPgEsaxq3FHg+tdemtddbrTgiRoFRgOHh4ajVaq2GdeSObdu5dd9cZ1x7B6+szdpfr9epsl/d4rqKcV3FuK5ielFX2UtAO4DX3smzEdje1H5VejfQauBoukT0AHCBpEXpxd8LUpuZmfVJ2z+PJd1D46/3xZImaLyb5xbgXknXAs8Cl6fhO4GLgXHgZeAagIg4Iun3gUfTuI9GxPQXls3MrIfaBkBEXDFD19oWYwO4bob1bAG2FKrOzMy6xncCm5llygFgZpYpB4CZWaYcAGZmmXIAmJllygFgZpYpB4CZWaYcAGZmmXIAmJllygFgZpYpB4CZWaYcAGZmmXIAmJllygFgZpYpB4CZWaZ6/32JZieJfc8d5epN9/d8uwdvuaTn27STk88AzMwy5QAwM8tUpQCQ9D8k7Zf0pKR7JJ0uabmkPZKelvQpSaemsael+fHUPzQXO2BmZuWUDgBJg8BvA8MR8VZgHrAB+EPgtohYAbwIXJsWuRZ4MSJ+GrgtjTMzsz6pegloPnCGpPnA64HDwLuA+1L/VuCyNL0+zZP610pSxe2bmVlJpQMgIp4D/gh4lsYv/qPAXuCliDiWhk0Ag2l6EDiUlj2Wxp9ddvtmZlaNIqLcgtIi4NPArwEvAX+V5jenyzxIWgbsjIiVkvYDF0bEROr7GrAqIr41bb0jwAjAwMDA+WNjY6XqA5g8cpQXXim9eGkrB8+atX9qaooFCxb0qJrOua5i/PNVjOsqpkpda9as2RsRw+3GVbkP4FeAr0fENwAkfQb4JWChpPnpr/ylwPNp/ASwDJhIl4zOAo5MX2lEjAKjAMPDw1Gr1UoXeMe27dy6r/e3Ohy8sjZrf71ep8p+dYvrKsY/X8W4rmJ6UVeV1wCeBVZLen26lr8W+DLwMPCeNGYjsD1N70jzpP6Houzph5mZVVblNYA9NF7MfRzYl9Y1CnwYuF7SOI1r/HemRe4Ezk7t1wObKtRtZmYVVTp/jYjNwOZpzc8Aq1qM/S5weZXtmZnZ3PGdwGZmmXIAmJllygFgZpYpB4CZWaYcAGZmmXIAmJllygFgZpYpB4CZWaYcAGZmmXIAmJllygFgZpYpB4CZWaYcAGZmmXIAmJllygFgZpYpB4CZWaYcAGZmmXIAmJllqlIASFoo6T5JX5F0QNLbJb1B0i5JT6d/F6WxknS7pHFJT0g6b252wczMyqh6BvBx4G8j4j8BPw8coPFl7w9GxArgQX745e8XASvSYwT4RMVtm5lZBaUDQNKPA+8E7gSIiO9FxEvAemBrGrYVuCxNrwfujobdwEJJS0pXbmZmlVQ5A3gT8A3g/0r6gqRPSjoTGIiIwwDp33PS+EHgUNPyE6nNzMz6QBFRbkFpGNgNvCMi9kj6OPBt4AMRsbBp3IsRsUjS/cAfRMQ/pvYHgRsjYu+09Y7QuETEwMDA+WNjY6XqA5g8cpQXXim9eGkrB8+atX9qaooFCxb0qJrOua5i/PNVjOsqpkpda9as2RsRw+3GzS+19oYJYCIi9qT5+2hc739B0pKIOJwu8Uw2jV/WtPxS4PnpK42IUWAUYHh4OGq1WukC79i2nVv3VdnFcg5eWZu1v16vU2W/usV1FeOfr2JcVzG9qKv0JaCI+GfgkKS3pKa1wJeBHcDG1LYR2J6mdwBXpXcDrQaOvnapyMzMeq/qny8fALZJOhV4BriGRqjcK+la4Fng8jR2J3AxMA68nMaamVmfVAqAiPgi0Oo609oWYwO4rsr2zMxs7vhOYDOzTDkAzMwy5QAwM8uUA8DMLFMOADOzTDkAzMwy5QAwM8uUA8DMLFMOADOzTDkAzMwy5QAwM8uUA8DMLFMOADOzTDkAzMwy5QAwM8uUA8DMLFMOADOzTDkAzMwyVTkAJM2T9AVJn03zyyXtkfS0pE+l7wtG0mlpfjz1D1XdtpmZlTcXZwAfBA40zf8hcFtErABeBK5N7dcCL0bETwO3pXFmZtYnlQJA0lLgEuCTaV7Au4D70pCtwGVpen2aJ/WvTePNzKwPqp4BfAy4EfjXNH828FJEHEvzE8Bgmh4EDgGk/qNpvJmZ9YEiotyC0ruBiyPifZJqwIeAa4B/Spd5kLQM2BkRKyXtBy6MiInU9zVgVUR8a9p6R4ARgIGBgfPHxsbK7RkweeQoL7xSevHSVg6eNWv/1NQUCxYs6FE1nXNdxfjnqxjXVUyVutasWbM3IobbjZtfau0N7wAulXQxcDrw4zTOCBZKmp/+yl8KPJ/GTwDLgAlJ84GzgCPTVxoRo8AowPDwcNRqtdIF3rFtO7fuq7KL5Ry8sjZrf71ep8p+dYvrKsY/X8W4rmJ6UVfpS0ARcVNELI2IIWAD8FBEXAk8DLwnDdsIbE/TO9I8qf+hKHv6YWZmlXXjPoAPA9dLGqdxjf/O1H4ncHZqvx7Y1IVtm5lZh+bk/DUi6kA9TT8DrGox5rvA5XOxPTMzq853ApuZZcoBYGaWKQeAmVmmHABmZplyAJiZZcoBYGaWqd7fxmhdNbTp/tLL3rDyGFeXXP7gLZeU3q6Z9YfPAMzMMuUAMDPLlAPAzCxTDgAzs0w5AMzMMnVSvwto5eu+zsHTN/dhy0f7sE0zs2J8BmBmlikHgJlZpk7qS0BmZlVVubmyirvWndn1bfgMwMwsUw4AM7NMlQ4AScskPSzpgKT9kj6Y2t8gaZekp9O/i1K7JN0uaVzSE5LOm6udMDOz4qqcARwDboiInwFWA9dJOpfGl70/GBErgAf54Ze/XwSsSI8R4BMVtm1mZhWVDoCIOBwRj6fpfwEOAIPAemBrGrYVuCxNrwfujobdwEJJS0pXbmZmlczJawCShoBfBPYAAxFxGBohAZyThg0Ch5oWm0htZmbWB4qIaiuQFgD/ANwcEZ+R9FJELGzqfzEiFkm6H/iDiPjH1P4gcGNE7J22vhEal4gYGBg4f2xsrHRtU0cmWfDq86WXL23JL8zaPTU1xYIFC7qy6X3Plb8LeeAMeOGVcsuuHDyr9Hbb6ebxqmLyyNHSx6uKdsf6eD1eJ2pdVZ5TVSw/a17p47VmzZq9ETHcblyl+wAknQJ8GtgWEZ9JzS9IWhIRh9MlnsnUPgEsa1p8KfAjv50jYhQYBRgeHo5arVa6vvo9H6P2VB8+CuKK2X9g6vU6VfZrNmW/0AUaXwhz675yPxIHr6yV3m473TxeVdyxbXvp41VFu2N9vB6vE7WuKs+pKu5ad2bXj1eVdwEJuBM4EBF/3NS1A9iYpjcC25var0rvBloNHH3tUpGZmfVelT9f3gG8F9gn6Yup7XeAW4B7JV0LPAtcnvp2AhcD48DLwDUVtm1mZhWVDoB0LV8zdK9tMT6A68puz8zM5pY/C8jMOlblc3FuWHms9PX0g7dcUnq7NjN/FISZWaZ8BmBWkr9wyE50PgMwM8uUA8DMLFMOADOzTDkAzMwy5QAwM8uUA8DMLFMOADOzTDkAzMwy5QAwM8uUA8DMLFMOADOzTPmzgE4yB0//r6WXrb/u9yp8to0/n8bsROMzADOzTPkMwMxsFlXOqquo/+DbdLvHZwBmZpnq+RmApHXAx4F5wCcj4pZe12Bm5fg1ppNLT88AJM0D/gS4CDgXuELSub2swczMGnp9CWgVMB4Rz0TE94AxYH2PazAzM3ofAIPAoab5idRmZmY9pojo3caky4ELI+I30/x7gVUR8YGmMSPASJp9C/BUhU0uBr5ZYflucV3FuK5iXFcxJ2NdPxURb2w3qNcvAk8Ay5rmlwLPNw+IiFFgdC42JumxiBiei3XNJddVjOsqxnUVk3Ndvb4E9CiwQtJySacCG4AdPa7BzMzo8RlARByT9H7gARpvA90SEft7WYOZmTX0/D6AiNgJ7OzR5ubkUlIXuK5iXFcxrquYbOvq6YvAZmZ2/PBHQZiZZeqEDwBJ6yQ9JWlc0qYW/adJ+lTq3yNp6Dip62pJ35D0xfT4zR7VtUXSpKQnZ+iXpNtT3U9IOu84qasm6WjT8frdHtW1TNLDkg5I2i/pgy3G9PyYdVhXz4+ZpNMlfV7Sl1Jdv9diTM+fkx3W1ZfnZNr2PElfkPTZFn3dO14RccI+aLyQ/DXgTcCpwJeAc6eNeR/wp2l6A/Cp46Suq4H/3Ydj9k7gPODJGfovBj4HCFgN7DlO6qoBn+3D8VoCnJemfwz4aov/y54fsw7r6vkxS8dgQZo+BdgDrJ42ph/PyU7q6stzMm37euAvW/1/dfN4nehnAJ18tMR6YGuavg9YK0nHQV19ERGPAEdmGbIeuDsadgMLJS05Durqi4g4HBGPp+l/AQ7wo3ev9/yYdVhXz6VjMJVmT0mP6S809vw52WFdfSFpKXAJ8MkZhnTteJ3oAdDJR0v8YExEHKPxsYJnHwd1AfyXdMngPknLWvT3w/H8cR1vT6fwn5P0s73eeDr1/kUafz026+sxm6Uu6MMxS5czvghMArsiYsbj1cPnZCd1QX+ekx8DbgT+dYb+rh2vEz0AWqXg9FTvZMxc62SbfwMMRcTPAX/PDxO+3/pxvDrxOI3b238euAP4615uXNIC4NPAf4+Ib0/vbrFIT45Zm7r6cswi4vsR8Qs07vRfJemt04b05Xh1UFfPn5OS3g1MRsTe2Ya1aJuT43WiB0Dbj5ZoHiNpPnAW3b/U0MlHXnwrIl5Ns38GnN/lmjrVyTHtuYj49mun8NG4l+QUSYt7sW1Jp9D4JbstIj7TYkhfjlm7uvp5zNI2XwLqwLppXf14Tratq0/PyXcAl0o6SONS8bsk/cW0MV07Xid6AHTy0RI7gI1p+j3AQ5FeTelnXdOuEV9K4xru8WAHcFV6Z8tq4GhEHO53UZL+w2vXPSWtovGz+60ebFfAncCBiPjjGYb1/Jh1Ulc/jpmkN0pamKbPAH4F+Mq0YT1/TnZSVz+ekxFxU0QsjYghGr8nHoqIX582rGvH64T+TuCY4aMlJH0UeCwidtB4kvy5pHEaqbnhOKnrtyVdChxLdV3d7boAJN1D490hiyVNAJtpvCBGRPwpjbu0LwbGgZeBa46Tut4D/JakY8ArwIYeBDk0/kJ7L7AvXT8G+B3gJ5tq68cx66SufhyzJcBWNb786XXAvRHx2X4/Jzusqy/PyVZ6dbx8J7CZWaZO9EtAZmZWkgPAzCxTDgAzs0w5AMzMMuUAMDPLlAPAzCxTDgAzs0w5AMzMMvVv8JlUGH38gT8AAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.15,\n",
    "                                                      stratify=y, random_state=SEED)\n",
    "print(train_x.shape, train_y.shape, valid_x.shape, valid_y.shape)\n",
    "train_y.hist()\n",
    "valid_y.hist()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Simple picture to explain Diabetic Retinopathy\n",
    "How do we know that a patient have diabetic retinopahy? There are at least 5 things to spot on. Image credit https://www.eyeops.com/\n",
    "![image.png](https://sa1s3optim.patientpop.com/assets/images/provider/photos/1947516.jpeg)\n",
    "\n",
    "\n",
    "From quick investigations of the data (see various pictures below), I found that Hemorrphages, Hard Exudates and Cotton Wool spots are quite easily observed. However, I still could not find examples of Aneurysm or Abnormal Growth of Blood Vessels from our data yet. Perhaps the latter two cases are important if we want to catch up human benchmnark using our model.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "# Further improve by auto-cropping\n",
    "\n",
    "To crop out the uninformative black areas which are evident on pic(0,1), pic(0,3) and pic(4,1), we can try auto cropping. I found 4 alternative codes from https://stackoverflow.com/questions/13538748/crop-black-edges-with-opencv and https://codereview.stackexchange.com/questions/132914/crop-black-border-of-image-using-numpy/132934 ... Fortunately one method works perfectly for a gray scale image, but none works on a color image. In this kernel, I modify the method working on gray-scale a bit to make it suitable for a color image.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def crop_image_from_gray(img,tol=7):\n",
    "    if img.ndim ==2:\n",
    "        mask = img>tol\n",
    "        return img[np.ix_(mask.any(1),mask.any(0))]\n",
    "    elif img.ndim==3:\n",
    "        gray_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)\n",
    "        mask = gray_img>tol\n",
    "        \n",
    "        check_shape = img[:,:,0][np.ix_(mask.any(1),mask.any(0))].shape[0]\n",
    "        if (check_shape == 0): # image is too dark so that we crop out everything,\n",
    "            return img # return original image\n",
    "        else:\n",
    "            img1=img[:,:,0][np.ix_(mask.any(1),mask.any(0))]\n",
    "            img2=img[:,:,1][np.ix_(mask.any(1),mask.any(0))]\n",
    "            img3=img[:,:,2][np.ix_(mask.any(1),mask.any(0))]\n",
    "            img = np.stack([img1,img2,img3],axis=-1)\n",
    "        return img"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "Try Cropping the images\n",
    "\n",
    "I have tested on around 200 images, and the method works great. However, if anybody find the outlier cases which cause the auto crop to fail, please let me know. I think now the eye pictures are very like the moon by the way :)\n",
    "\n",
    "**IMPORTANT UPDATE** on Kernel V.9 I found that there is indeed a case in private test set making the old version of crop function fail. (I spent my 13 submissions until I found this bug) E.g. if there is an adversarial image (super dark) in the private test set, the crop function will crop everything and result in 0 dimension image. I have fixed this bug in this kernel version, but I still could not guarantee whether there are other cases in a private test that will make the crop function fail or not. Update on V11 Now I was able to have a valid LB score with the new crop function, so if anybody still have some submission errors, that is the reason of other bugs.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "# A Important Update on Color Version of Cropping & Ben's Preprocessing\n",
    "\n",
    "At first, when I wrote this kernel, I could not make a color crop nicely, so I thought that gray scale is a better representation. Now I believe that color version is better, so from this point on I will use color cropping\n",
    "\n",
    "Below is the cropped of the color version. For color version, note that I use argument sigmaX = 30 of cv2.GaussianBlur, where Ben actually used sigmaX = 10 which may have better performance. I just feel that this sigmaX = 30 or sigmaX = 50 make beautiful [sometimes bloody] yellow moon pictures. Just for the purpose of illustration.\n",
    "\n",
    "Please refer to https://www.tutorialkart.com/opencv/python/opencv-python-gaussian-image-smoothing/ .\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_ben_color(path, sigmaX=10):\n",
    "    image = cv2.imread(path)\n",
    "    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)\n",
    "    image = crop_image_from_gray(image)\n",
    "    image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))\n",
    "    image=cv2.addWeighted ( image,4, cv2.GaussianBlur( image , (0,0) , sigmaX) ,-4 ,128)\n",
    "        \n",
    "    return image"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_old = pd.read_csv('../input/diabetic-retinopathy-resized/trainLabels.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "29b9021ffdd540339433753b063dbc96",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "save_dir = \"../input/diabetic-retinopathy-resized/ben_preprocessing_sigmaX10\"\n",
    "if not os.path.exists(save_dir):\n",
    "    os.mkdir(save_dir)\n",
    "\n",
    "for idx, row in tqdm_notebook(df_old.iterrows()):\n",
    "    path=f\"../input/diabetic-retinopathy-resized/resized_train/{row['image']}.jpeg\"\n",
    "    image = load_ben_color(path,sigmaX=10)\n",
    "    Image.fromarray(image).save(os.path.join(save_dir, \"{}.png\".format(row['image'])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ce0ab473e0a5421d9fb03c67b936f07c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "save_dir = \"../input/aptos2019-blindness-detection/train_images_ben_preprocessing_sigmaX10\"\n",
    "if not os.path.exists(save_dir):\n",
    "    os.mkdir(save_dir)\n",
    "\n",
    "for idx, row in tqdm_notebook(df_train.iterrows()):\n",
    "    path=f\"../input/aptos2019-blindness-detection/train_images/{row['id_code']}.png\"\n",
    "    image = load_ben_color(path,sigmaX=10)\n",
    "    Image.fromarray(image).save(os.path.join(save_dir, \"{}.png\".format(row['id_code'])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "cf4228d45b2f41c8823433ae32fa299d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "save_dir = \"../input/aptos2019-blindness-detection/test_images_ben_preprocessing_sigmaX10\"\n",
    "if not os.path.exists(save_dir):\n",
    "    os.mkdir(save_dir)\n",
    "\n",
    "for idx, row in tqdm_notebook(df_test.iterrows()):\n",
    "    path=f\"../input/aptos2019-blindness-detection/test_images/{row['id_code']}.png\"\n",
    "    image = load_ben_color(path,sigmaX=10)\n",
    "    Image.fromarray(image).save(os.path.join(save_dir, \"{}.png\".format(row['id_code'])))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
